{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "source": [
    "# I - Consolidation of datasets"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "   timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n0         28800001       28804000        1.00        0.68        3.22   \n1         28804001       28808000        1.00        0.59        3.53   \n2         28808001       28812000        0.86        0.80        3.13   \n3         28812001       28816000        1.00        0.65        3.43   \n4         28816001       28820000        0.67        0.54        3.23   \n\n   kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \n0       12.36        0.53          0.96        1          1.0   103001  \n1       15.39        0.51          0.92        1          1.0   103001  \n2       11.75        0.50          0.93        1          1.0   103001  \n3       13.83        0.50          0.95        1          1.0   103001  \n4       13.00        0.51          0.95        1          1.0   103001  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>28800001</td>\n      <td>28804000</td>\n      <td>1.00</td>\n      <td>0.68</td>\n      <td>3.22</td>\n      <td>12.36</td>\n      <td>0.53</td>\n      <td>0.96</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>28804001</td>\n      <td>28808000</td>\n      <td>1.00</td>\n      <td>0.59</td>\n      <td>3.53</td>\n      <td>15.39</td>\n      <td>0.51</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>28808001</td>\n      <td>28812000</td>\n      <td>0.86</td>\n      <td>0.80</td>\n      <td>3.13</td>\n      <td>11.75</td>\n      <td>0.50</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>28812001</td>\n      <td>28816000</td>\n      <td>1.00</td>\n      <td>0.65</td>\n      <td>3.43</td>\n      <td>13.83</td>\n      <td>0.50</td>\n      <td>0.95</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>28816001</td>\n      <td>28820000</td>\n      <td>0.67</td>\n      <td>0.54</td>\n      <td>3.23</td>\n      <td>13.00</td>\n      
<td>0.51</td>\n      <td>0.95</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       600\n111001     22661\n113001       900\n124001      1200",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>600</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>22661</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>900</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>1200</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg\npatient                       \n103001   0.555000     0.724195\n111001   0.348749     0.419415\n113001   0.525556     0.660980\n124001   0.115000     0.286792",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.555000</td>\n      <td>0.724195</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.348749</td>\n      <td>0.419415</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.525556</td>\n      <td>0.660980</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.115000</td>\n      <td>0.286792</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# Patient ids whose per-patient CSV files are consolidated below\n",
    "patients = [103001, 111001, 113001, 124001]\n",
    "\n",
    "# Read every per-patient file once, tag its rows with the patient id, and\n",
    "# concatenate in a single call rather than growing the frame inside a loop.\n",
    "# Each file keeps its own row index (no ignore_index), as before.\n",
    "df_ml_conso = pd.concat(\n",
    "    [\n",
    "        pd.read_csv('../../datasets/2_dataset_creation_4s/df_ml_{}_norm.csv'.format(patient)).assign(patient=patient)\n",
    "        for patient in patients\n",
    "    ],\n",
    "    axis=0,\n",
    ")\n",
    "\n",
    "display(df_ml_conso.head())\n",
    "\n",
    "# Number of rows per patient\n",
    "print('\\nPatient count by classification:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient', aggfunc='count'))\n",
    "\n",
    "# Mean label per patient; the string alias avoids the deprecation warning that\n",
    "# recent pandas emits when a NumPy callable is passed as aggfunc\n",
    "print('\\nPatient average by classification:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg'], index='patient', aggfunc='mean'))"
   ]
  },
  {
   "source": [
    "# II - Applying a quality threshold"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       600\n111001     22661\n113001       900\n124001      1200",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>600</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>22661</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>900</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>1200</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg  classif_threshold\npatient                                          \n103001   0.555000     0.724195           0.561667\n111001   0.348749     0.419415           0.353162\n113001   0.525556     0.660980           0.534444\n124001   0.115000     0.286792           0.127500",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>classif_threshold</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.555000</td>\n      <td>0.724195</td>\n      <td>0.561667</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.348749</td>\n      <td>0.419415</td>\n      <td>0.353162</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.525556</td>\n      <td>0.660980</td>\n      <td>0.534444</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.115000</td>\n      <td>0.286792</td>\n      <td>0.127500</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# Threshold on classif_avg (proportion of optimal by observation):\n",
    "# rows at or above it are labelled 1, all others 0\n",
    "classif_threshold = 0.95\n",
    "\n",
    "meets_threshold = df_ml_conso['classif_avg'] >= classif_threshold\n",
    "df_ml_conso['classif_threshold'] = meets_threshold.astype('int64')\n",
    "\n",
    "print('\\nPatient count:')\n",
    "count_by_patient = pd.pivot_table(df_ml_conso, values=['classif'], index='patient', aggfunc='count')\n",
    "display(count_by_patient)\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "mean_by_patient = pd.pivot_table(\n",
    "    df_ml_conso,\n",
    "    values=['classif', 'classif_avg', 'classif_threshold'],\n",
    "    index='patient',\n",
    "    aggfunc=np.mean,\n",
    ")\n",
    "display(mean_by_patient)"
   ]
  },
  {
   "source": [
    "# III - Creation of a validation dataset"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the consolidated rows; the fixed seed makes the split repeatable\n",
    "df_ml_conso_for_model, df_ml_conso_validation = train_test_split(\n",
    "    df_ml_conso,\n",
    "    test_size=0.2,\n",
    "    random_state=42,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "   timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n0         28800001       28804000        1.00        0.68        3.22   \n1         28804001       28808000        1.00        0.59        3.53   \n2         28808001       28812000        0.86        0.80        3.13   \n3         28812001       28816000        1.00        0.65        3.43   \n4         28816001       28820000        0.67        0.54        3.23   \n\n   kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \\\n0       12.36        0.53          0.96        1          1.0   103001   \n1       15.39        0.51          0.92        1          1.0   103001   \n2       11.75        0.50          0.93        1          1.0   103001   \n3       13.83        0.50          0.95        1          1.0   103001   \n4       13.00        0.51          0.95        1          1.0   103001   \n\n   classif_threshold  \n0                  1  \n1                  1  \n2                  1  \n3                  1  \n4                  1  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n      <th>classif_threshold</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>28800001</td>\n      <td>28804000</td>\n      <td>1.00</td>\n      <td>0.68</td>\n      <td>3.22</td>\n      <td>12.36</td>\n      <td>0.53</td>\n      <td>0.96</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>28804001</td>\n      <td>28808000</td>\n      <td>1.00</td>\n      <td>0.59</td>\n      <td>3.53</td>\n      <td>15.39</td>\n      <td>0.51</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>28808001</td>\n      <td>28812000</td>\n      <td>0.86</td>\n      <td>0.80</td>\n      <td>3.13</td>\n      <td>11.75</td>\n      <td>0.50</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>28812001</td>\n      <td>28816000</td>\n      <td>1.00</td>\n      <td>0.65</td>\n      <td>3.43</td>\n      <td>13.83</td>\n      <td>0.50</td>\n      <td>0.95</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>28816001</td>\n      
<td>28820000</td>\n      <td>0.67</td>\n      <td>0.54</td>\n      <td>3.23</td>\n      <td>13.00</td>\n      <td>0.51</td>\n      <td>0.95</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (25361, 12)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "       timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n17340         69362103       69366102        0.80        0.96        5.16   \n6849          27396541       27400540        0.75        0.81       -1.39   \n1647           6588071        6592070        0.53        0.56        3.94   \n14508         58034004       58038003        0.33        0.55        0.78   \n7121          28484625       28488624        0.22        0.89       -0.77   \n\n       kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \\\n17340       34.27        0.50          0.92        1          1.0   111001   \n6849         0.43        0.52          0.44        0          0.0   111001   \n1647        18.33        0.43          0.96        0          0.0   111001   \n14508        5.42        0.49          0.77        1          1.0   111001   \n7121         0.30        0.51          0.44        0          0.0   111001   \n\n       classif_threshold  \n17340                  1  \n6849                   0  \n1647                   0  \n14508                  1  \n7121                   0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n      <th>classif_threshold</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>17340</th>\n      <td>69362103</td>\n      <td>69366102</td>\n      <td>0.80</td>\n      <td>0.96</td>\n      <td>5.16</td>\n      <td>34.27</td>\n      <td>0.50</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>6849</th>\n      <td>27396541</td>\n      <td>27400540</td>\n      <td>0.75</td>\n      <td>0.81</td>\n      <td>-1.39</td>\n      <td>0.43</td>\n      <td>0.52</td>\n      <td>0.44</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>111001</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1647</th>\n      <td>6588071</td>\n      <td>6592070</td>\n      <td>0.53</td>\n      <td>0.56</td>\n      <td>3.94</td>\n      <td>18.33</td>\n      <td>0.43</td>\n      <td>0.96</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>111001</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>14508</th>\n      <td>58034004</td>\n      <td>58038003</td>\n      <td>0.33</td>\n      <td>0.55</td>\n      <td>0.78</td>\n      <td>5.42</td>\n      <td>0.49</td>\n      <td>0.77</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7121</th>\n      <td>28484625</td>\n     
 <td>28488624</td>\n      <td>0.22</td>\n      <td>0.89</td>\n      <td>-0.77</td>\n      <td>0.30</td>\n      <td>0.51</td>\n      <td>0.44</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>111001</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (20288, 12)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "       timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n17340         69362103       69366102        0.80        0.96        5.16   \n6849          27396541       27400540        0.75        0.81       -1.39   \n1647           6588071        6592070        0.53        0.56        3.94   \n14508         58034004       58038003        0.33        0.55        0.78   \n7121          28484625       28488624        0.22        0.89       -0.77   \n\n       kSQI_score  pSQI_score  basSQI_score  classification  \n17340       34.27        0.50          0.92               1  \n6849         0.43        0.52          0.44               0  \n1647        18.33        0.43          0.96               0  \n14508        5.42        0.49          0.77               1  \n7121         0.30        0.51          0.44               0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classification</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>17340</th>\n      <td>69362103</td>\n      <td>69366102</td>\n      <td>0.80</td>\n      <td>0.96</td>\n      <td>5.16</td>\n      <td>34.27</td>\n      <td>0.50</td>\n      <td>0.92</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>6849</th>\n      <td>27396541</td>\n      <td>27400540</td>\n      <td>0.75</td>\n      <td>0.81</td>\n      <td>-1.39</td>\n      <td>0.43</td>\n      <td>0.52</td>\n      <td>0.44</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1647</th>\n      <td>6588071</td>\n      <td>6592070</td>\n      <td>0.53</td>\n      <td>0.56</td>\n      <td>3.94</td>\n      <td>18.33</td>\n      <td>0.43</td>\n      <td>0.96</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>14508</th>\n      <td>58034004</td>\n      <td>58038003</td>\n      <td>0.33</td>\n      <td>0.55</td>\n      <td>0.78</td>\n      <td>5.42</td>\n      <td>0.49</td>\n      <td>0.77</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7121</th>\n      <td>28484625</td>\n      <td>28488624</td>\n      <td>0.22</td>\n      <td>0.89</td>\n      <td>-0.77</td>\n      <td>0.30</td>\n      <td>0.51</td>\n      <td>0.44</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (20288, 9)\n"
     ]
    }
   ],
   "source": [
    "# Build the validation export from the held-out split, not from\n",
    "# df_ml_conso_for_model, so that no training rows end up in the exported file.\n",
    "# errors='ignore' keeps the cell re-runnable once the helper columns are gone.\n",
    "df_ml_conso_validation = (\n",
    "    df_ml_conso_validation\n",
    "    .drop(columns=['patient', 'classif', 'classif_avg'], errors='ignore')\n",
    "    .rename(columns={'classif_threshold': \"classification\"})\n",
    ")\n",
    "\n",
    "# The model and validation splits must partition the consolidated frame\n",
    "assert len(df_ml_conso_for_model) + len(df_ml_conso_validation) == len(df_ml_conso)\n",
    "\n",
    "for element in [df_ml_conso, df_ml_conso_for_model, df_ml_conso_validation]:\n",
    "    display(element.head())\n",
    "    print('Shape : {}'.format(element.shape))\n",
    "\n",
    "df_ml_conso_validation.to_csv('../../datasets/3_ml_patients_consolidation/df_ml_conso_validation_norm_4s.csv', index=False)"
   ]
  },
  {
   "source": [
    "# IV - Balancing the class distribution for each patient"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       418\n111001     12864\n113001       676\n124001       238",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>418</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>12864</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>676</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>238</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg  classif_threshold\npatient                                          \n103001   0.495215     0.681547                0.5\n111001   0.493392     0.551143                0.5\n113001   0.494083     0.632800                0.5\n124001   0.453782     0.599956                0.5",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>classif_threshold</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.495215</td>\n      <td>0.681547</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.493392</td>\n      <td>0.551143</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.494083</td>\n      <td>0.632800</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.453782</td>\n      <td>0.599956</td>\n      <td>0.5</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nProportion of each class\n1    7098\n0    7098\nName: classif_threshold, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Seed for the per-patient downsampling so the balanced export is reproducible\n",
    "# (same value as the train/validation split)\n",
    "balance_seed = 42\n",
    "\n",
    "balanced_parts = []\n",
    "for patient in patients:\n",
    "    df_patient = df_ml_conso_for_model[df_ml_conso_for_model['patient'] == patient]\n",
    "    df_class1 = df_patient[df_patient['classif_threshold'] == 1]\n",
    "    df_class0 = df_patient[df_patient['classif_threshold'] == 0]\n",
    "\n",
    "    # Downsample only the larger class to the size of the smaller one;\n",
    "    # on a tie class 1 is the one passed through sample(), as before\n",
    "    n_per_class = min(len(df_class0), len(df_class1))\n",
    "    if len(df_class1) >= len(df_class0):\n",
    "        df_class1 = df_class1.sample(n_per_class, random_state=balance_seed)\n",
    "    else:\n",
    "        df_class0 = df_class0.sample(n_per_class, random_state=balance_seed)\n",
    "\n",
    "    # Class 0 rows first, then class 1 rows, for each patient in turn\n",
    "    balanced_parts.extend([df_class0, df_class1])\n",
    "\n",
    "# Single concat instead of growing the frame inside the loop\n",
    "df_ml_conso_balanced = pd.concat(balanced_parts)\n",
    "\n",
    "print('\\nPatient count:')\n",
    "display(pd.pivot_table(df_ml_conso_balanced, values=['classif'], index='patient', aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "display(pd.pivot_table(df_ml_conso_balanced, values=['classif', 'classif_avg', 'classif_threshold'], index='patient', aggfunc='mean'))\n",
    "\n",
    "print('\\nProportion of each class')\n",
    "print(df_ml_conso_balanced['classif_threshold'].value_counts())"
   ]
  },
  {
   "source": [
    "# V - Export"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "     timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n360         57840063       57844062        0.80        0.55        4.08   \n584         58736181       58740180        0.29        0.70        2.65   \n483         58332142       58336141        0.89        0.54        4.13   \n335         57740044       57744043        0.89        0.55        4.28   \n300         57600006       57604007        0.73        0.55        3.65   \n\n     kSQI_score  pSQI_score  basSQI_score  classification  \n360       19.63        0.50          0.94               0  \n584        9.16        0.49          0.92               0  \n483       20.01        0.54          0.98               0  \n335       21.11        0.54          0.98               0  \n300       15.83        0.52          0.96               0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classification</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>360</th>\n      <td>57840063</td>\n      <td>57844062</td>\n      <td>0.80</td>\n      <td>0.55</td>\n      <td>4.08</td>\n      <td>19.63</td>\n      <td>0.50</td>\n      <td>0.94</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>584</th>\n      <td>58736181</td>\n      <td>58740180</td>\n      <td>0.29</td>\n      <td>0.70</td>\n      <td>2.65</td>\n      <td>9.16</td>\n      <td>0.49</td>\n      <td>0.92</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>483</th>\n      <td>58332142</td>\n      <td>58336141</td>\n      <td>0.89</td>\n      <td>0.54</td>\n      <td>4.13</td>\n      <td>20.01</td>\n      <td>0.54</td>\n      <td>0.98</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>335</th>\n      <td>57740044</td>\n      <td>57744043</td>\n      <td>0.89</td>\n      <td>0.55</td>\n      <td>4.28</td>\n      <td>21.11</td>\n      <td>0.54</td>\n      <td>0.98</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>300</th>\n      <td>57600006</td>\n      <td>57604007</td>\n      <td>0.73</td>\n      <td>0.55</td>\n      <td>3.65</td>\n      <td>15.83</td>\n      <td>0.52</td>\n      <td>0.96</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# Drop the helper columns, then rename whatever column is left in last position\n",
    "# to \"classification\" before previewing the frame\n",
    "balanced_features = df_ml_conso_balanced.drop(columns=['patient', 'classif', 'classif_avg'])\n",
    "label_column = balanced_features.columns[-1]\n",
    "df_ml_conso_balanced = balanced_features.rename(columns={label_column: \"classification\"})\n",
    "display(df_ml_conso_balanced.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the balanced frame to CSV without the row index\n",
    "df_ml_conso_balanced.to_csv(\n",
    "    '../../datasets/3_ml_patients_consolidation/df_ml_conso_balanced_norm_4s.csv',\n",
    "    index=False,\n",
    ")"
   ]
  }
 ]
}